* Session setup: close any log left open by a previous (possibly failed) run,
* empty memory, disable output paging, and write logs as plain text.
capture log close
clear
set more off
set logtype text

/***** Package used: 
package name:  clus_nway.pkg
        from:  http://fmwww.bc.edu/RePEc/bocode/c/

*/

/* NOTE: This file must be run on the NBER server */

***** CHANGE TO OWN HOME DIRECTORY
* Root folder for every file this script reads or writes.
global homedir "/homes/data/cens1940.work/olivetti/lsalisbu"

**** 

* Load the "child" sample. The path is quoted, like every other path in this
* file, so that a home directory containing spaces still resolves.
use "$homedir/simulation_dataset_2_child.dta", clear

* Write out the sex / educ_attain columns for rows where both are non-missing,
* in Stata 12 format. (The file name suggests it is consumed by an R script.)
* preserve/restore leaves the in-memory data untouched.
preserve
drop if sex==. | educ_attain==.
keep sex educ_attain
saveold "$homedir/dataset_child_for_R_2.dta", replace version(12)
restore

* Rows without a recorded sex cannot be placed in a name-by-sex cell.
drop if sex==.

* First name = first word of the name string with periods removed, surrounding
* blanks trimmed, and proper-case capitalisation applied.
gen str first = proper(trim(subinstr(word(name, 1), ".", "", .)))

*** obvious abbreviations (not nicknames)
* proper() capitalises the letter after an apostrophe, hence "Fred'K"/"Sam'L".
replace first = "Charles"   if first == "Chas"
replace first = "Daniel"    if first == "Danl"
replace first = "Frederick" if inlist(first, "Fredk", "Fred'K")
replace first = "George"    if first == "Geo"
replace first = "James"     if first == "Jas"
replace first = "John"      if first == "Jno"
replace first = "Joseph"    if first == "Jos"
replace first = "Michael"   if first == "Michl"
replace first = "Richard"   if first == "Richd"
replace first = "Robert"    if first == "Robt"
replace first = "Samuel"    if inlist(first, "Saml", "Sam'L")
replace first = "Thomas"    if inlist(first, "Thos", "Tho")
replace first = "William"   if first == "Wm"


* Flag first names that are empty or contain one of the characters ? % - ! *
* and remove those rows.
gen noname = missing(first)
foreach sym in "?" "%" "-" "!" "*" {
	replace noname = 1 if strpos(first, "`sym'") > 0
}

tab noname
drop if noname==1

* One group id per (first name, sex) pair, and the mean education within it.
egen namegrp = group(first sex)

egen educ_pseudo = mean(educ_attain), by(namegrp)

/*** Properties of the name distribution in the "child" sample:
R2 from regressing actual education on the name-level mean education,
and the share of people with a top-50 name */

* sex==1 is reported as "men" and sex==2 as "women"; the R2 values are kept in
* the locals rsq_men and rsq_women for later display.
foreach g in men women {
	local code = cond("`g'" == "men", 1, 2)
	reg educ_attain educ_pseudo if sex==`code'
	local rsq_`g' = e(r2)
	di "R2 for `g' = `rsq_`g''"
}

* Number of observations in each name-by-sex group, and a flag that marks
* exactly one observation per group.
egen count_name = count(namegrp), by(namegrp)
egen tag_name = tag(namegrp)

* Order names from most to least frequent within sex. namegrp is included as a
* sort key so that all observations of a name stay contiguous, with the tagged
* observation first. Without it, names with equal counts are interleaved (all
* tagged rows first, then all untagged rows), so the untagged rows of every
* tied name inherit the rank of the last name in the tie and can be
* misclassified at the top-50 cutoff.
gsort sex -count_name namegrp -tag_name

* Rank 1 starts at the first row of each sex ...
gen name_rank = 1 if sex!=sex[_n-1]
tab name_rank

* ... and increases by one at each subsequent tagged row (i.e. each new name).
replace name_rank = name_rank[_n-1] + tag_name if name_rank==.


gen top50 = name_rank<=50

* For each sex: share of people whose name is ranked in the top 50, the number
* of distinct names, and the number of distinct names held by more than one
* person. Results are stored in share50_<g>, n_names_<g> and n_names_<g>_g1.
foreach g in men women {
	local code  = cond("`g'" == "men", 1, 2)
	local label = cond("`g'" == "men", "male", "female")

	count if top50==1 & sex==`code'
	local n = r(N)
	count if sex==`code'
	local d = r(N)

	local share50_`g' = `n'/`d'

	di "Share `g' with top 50 name is `share50_`g''"

	* tag_name marks one row per name, so counting tagged rows counts names.
	count if tag_name==1 & sex==`code'
	local n_names_`g' = r(N)

	count if tag_name==1 & sex==`code' & count_name>1
	local n_names_`g'_g1 = r(N)

	di "`n_names_`g'' `label' names in child sample, `n_names_`g'_g1' occur >1 time"
}

* Collapse to one row per (first name, sex) holding the name-level mean
* education, and store it as the lookup file merged onto the adult sample.
drop if tag_name!=1

keep first sex educ_pseudo

save "$homedir/simulation_dataset_2_clean.dta", replace

**** Add pseudo education to "adult" dataset

* The path is quoted, like every other path in this file, so that a home
* directory containing spaces still resolves.
use "$homedir/simulation_dataset_2_adult.dta", clear

* Write out the spouses' education columns for couples where both are
* non-missing, in Stata 12 format. (The file name suggests it is consumed by
* an R script.) preserve/restore leaves the in-memory data untouched.
preserve
keep educ_attain_wife educ_attain_husb
drop if educ_attain_wife==. | educ_attain_husb==.
saveold "$homedir/dataset_adult_for_R_2.dta", replace version(12)
restore

* Build a cleaned first name for each spouse, using the same rules as for the
* child sample: first word of the name, periods removed, trimmed, proper case,
* then expansion of obvious abbreviations.
foreach v in husb wife {
	gen str first_`v' = proper(trim(subinstr(word(name_`v', 1), ".", "", .)))

	*** obvious abbreviations (not nicknames)
	* proper() capitalises the letter after an apostrophe, hence "Fred'K"/"Sam'L".
	replace first_`v' = "Charles"   if first_`v' == "Chas"
	replace first_`v' = "Daniel"    if first_`v' == "Danl"
	replace first_`v' = "Frederick" if inlist(first_`v', "Fredk", "Fred'K")
	replace first_`v' = "George"    if first_`v' == "Geo"
	replace first_`v' = "James"     if first_`v' == "Jas"
	replace first_`v' = "John"      if first_`v' == "Jno"
	replace first_`v' = "Joseph"    if first_`v' == "Jos"
	replace first_`v' = "Michael"   if first_`v' == "Michl"
	replace first_`v' = "Richard"   if first_`v' == "Richd"
	replace first_`v' = "Robert"    if first_`v' == "Robt"
	replace first_`v' = "Samuel"    if inlist(first_`v', "Saml", "Sam'L")
	replace first_`v' = "Thomas"    if inlist(first_`v', "Thos", "Tho")
	replace first_`v' = "William"   if first_`v' == "Wm"
}
	
* Attach the name-level mean education to each spouse. The lookup file is
* keyed on (sex, first), so temporary key variables are created per spouse:
* husbands are looked up with sex==1 and wives with sex==2. Lookup rows with
* no matching adult are discarded; adults with no matching name are kept with
* a missing pseudo education.
foreach v in husb wife {
	local code = cond("`v'" == "husb", 1, 2)

	gen first = first_`v'
	gen sex = `code'

	merge m:1 sex first using "$homedir/simulation_dataset_2_clean.dta", keep(master match) nogenerate

	drop first sex
	rename educ_pseudo educ_pseudo_`v'
}

* Quartiles of each spouse's pseudo education, and of the wife's actual education.
foreach v in husb wife {
	xtile pseudo_qtile_`v' = educ_pseudo_`v', nq(4)
}

xtile qtile_wife = educ_attain_wife, nq(4)

* Numeric ids for the spouses' first names (used as cluster variables below).
foreach v in wife husb {
	egen namegrp_`v' = group(first_`v')
}

* Variance of each spouse's pseudo education, computed as the squared sd.
foreach v in husb wife {
	summ educ_pseudo_`v'
	local var_`v' = (r(sd))^2
}

* Correlation between the spouses' pseudo education.
corr educ_pseudo_husb educ_pseudo_wife
local corr_coef = r(rho)

* Write the summary statistics stored in locals above (correlation and
* variances of pseudo education, R-squared by sex, top-50 name shares) to the
* text log tableA-9.txt, overwriting any existing file.
log using "$homedir/tableA-9.txt", replace

di "Correlation btw husband's & wife's pseudo eduation is `corr_coef'"
di "Variance of husband's pseudo education is `var_husb'"
di "Variance of wife's pseudo education is `var_wife'"
di "R-squared for men is `rsq_men'"
di "R-squared for women is `rsq_women'"
di "Share of men with top 50 name is `share50_men'"
di "Share of women with top 50 name is `share50_women'"

log close



***** Correlation coefficients

* Start tableA-3.txt (overwriting any existing file) with the spousal
* correlation in actual education followed by the spousal correlation in
* pseudo (name-level mean) education. The log is reopened in append mode
* further down for the regression output.
log using "$homedir/tableA-3.txt", replace
corr educ_attain_husb educ_attain_wife

corr educ_pseudo_husb educ_pseudo_wife

log close

* Build a stacked file with one copy of the estimation sample using actual
* education (pseudo==0) and one copy using pseudo education (pseudo==1), both
* under the "actual" variable names so they can be pooled in one regression.

* Actual education: keep only the rows used by the regression.
regress educ_attain_husb i.qtile_wife
preserve
	keep if e(sample)
	keep educ_attain_husb educ_attain_wife qtile_wife namegrp_wife namegrp_husb
	gen pseudo = 0
	save "$homedir/temp_file_2.dta", replace
restore


* Pseudo education: same steps, then rename to the actual-education names and
* append the pseudo==0 rows saved above.
regress educ_pseudo_husb i.pseudo_qtile_wife
preserve
	keep if e(sample)
	keep educ_pseudo_husb educ_pseudo_wife pseudo_qtile_wife namegrp_wife namegrp_husb
	gen pseudo = 1
	rename (educ_pseudo_husb pseudo_qtile_wife educ_pseudo_wife) (educ_attain_husb qtile_wife educ_attain_wife)
	append using "$homedir/temp_file_2.dta"
	save "$homedir/temp_file_2.dta", replace
restore


* Husband's pseudo education on wife's pseudo-education quartile, clustering
* on both spouses' first-name groups.
clus_nway regress educ_pseudo_husb i.pseudo_qtile_wife, vce(cluster namegrp_wife namegrp_husb)

* Flag one row per distinct first name among spouses with a non-missing
* pseudo education ...
foreach v in wife husb {
	egen tag_name_`v' = tag(first_`v') if educ_pseudo_`v'!=.
}

* ... and report how many such names there are for wives and for husbands.
foreach v in wife husb {
	count if tag_name_`v'==1
}

* NOTE: this overwrites the name-level lookup file with the adult data.
save "$homedir/simulation_dataset_2_clean.dta", replace

* Switch to the stacked actual/pseudo file.
use "$homedir/temp_file_2.dta", clear

* Indicator for each quartile of the wife's education ...
forvalues i = 1/4 {
	gen q`i' = qtile_wife==`i'
}

* ... and its interaction with the pseudo-sample indicator.
forvalues i = 1/4 {
	gen pseudo_q`i' = q`i'*pseudo
}

* Reopen tableA-3.txt in append mode so the regression output follows the
* correlations written earlier.
log using "$homedir/tableA-3.txt", append

***** Regressions separately for actual & pseudo to recover coefficients
* pseudo==0 rows hold actual education; pseudo==1 rows hold pseudo education
* stored under the same variable names. Standard errors are clustered on both
* spouses' first-name groups.
clus_nway regress educ_attain_husb i.qtile_wife if pseudo==0, vce(cluster namegrp_wife namegrp_husb)
clus_nway regress educ_attain_husb i.qtile_wife if pseudo==1, vce(cluster namegrp_wife namegrp_husb)


*** Regressions combining actual & pseudo to recover ratio of actual to pseudo coefficient
* Coefficient order in e(b): q2 q3 q4 pseudo pseudo_q2 pseudo_q3 pseudo_q4 _cons.
clus_nway regress educ_attain_husb q2 q3 q4 pseudo pseudo_q2 pseudo_q3 pseudo_q4, vce(cluster namegrp_wife namegrp_husb)

**** 2nd quartile ratio: beta_1/(beta_1 + beta_5)
* For each quartile k, the block below computes the ratio
*   rat = b_k / (b_k + b_int)
* where b_k is the coefficient on q<k> and b_int the coefficient on pseudo_q<k>
* from the pooled regression just above, then a delta-method standard error:
*   multa = d(rat)/d(b_k)   =  b_int / (b_k + b_int)^2
*   multb = d(rat)/d(b_int) = -b_k   / (b_k + b_int)^2
*   var   = multa^2*V[k,k] + multb^2*V[int,int] + 2*multa*multb*V[int,k]
* and finally displays rat, its se, and rat -/+ 1.96*se.
matrix b = e(b)
matrix vcv=e(V)

local rat2 = b[1,1]/(b[1,1] + b[1,5])
di `rat2'
local multa = (b[1,5])/(b[1,1] + b[1,5])^2
local multb = (-b[1,1])/(b[1,1] + b[1,5])^2


local var2 = ((`multa')^2)*vcv[1,1] + ((`multb')^2)*vcv[5,5] + 2*`multa'*`multb'*vcv[5,1]
local se2 = sqrt(`var2')
di `se2'

di `rat2' - 1.96*`se2'
di `rat2' + 1.96*`se2'

***** 3rd quartile ratio: beta_2/(beta_2 + beta_6)
* Same calculation using columns 2 (q3) and 6 (pseudo_q3).

local rat3 = b[1,2]/(b[1,2] + b[1,6])
di `rat3'
local multa = (b[1,6])/(b[1,2] + b[1,6])^2
local multb = (-b[1,2])/(b[1,2] + b[1,6])^2


local var3 = ((`multa')^2)*vcv[2,2] + ((`multb')^2)*vcv[6,6] + 2*`multa'*`multb'*vcv[6,2]
local se3 = sqrt(`var3')
di `se3'

di `rat3' - 1.96*`se3'
di `rat3' + 1.96*`se3'

**** 4th quartile ratio: beta_3/(beta_3 + beta_7)
* Same calculation using columns 3 (q4) and 7 (pseudo_q4).

local rat4 = b[1,3]/(b[1,3] + b[1,7])
di `rat4'
local multa = (b[1,7])/(b[1,3] + b[1,7])^2
local multb = (-b[1,3])/(b[1,3] + b[1,7])^2


local var4 = ((`multa')^2)*vcv[3,3] + ((`multb')^2)*vcv[7,7] + 2*`multa'*`multb'*vcv[7,3]
local se4 = sqrt(`var4')
di `se4'

di `rat4' - 1.96*`se4'
di `rat4' + 1.96*`se4'

****** Continuous -- regression of husband's education on wife's education and vice versa

**** Separate continuous regressions to recover coefficients and standard errors
* Each specification is run once on the actual-education rows (pseudo==0) and
* once on the pseudo-education rows (pseudo==1), clustering on both spouses'
* first-name groups.

clus_nway regress educ_attain_wife educ_attain_husb if pseudo==0, vce(cluster namegrp_wife namegrp_husb) 
clus_nway regress educ_attain_wife educ_attain_husb if pseudo==1, vce(cluster namegrp_wife namegrp_husb) 

clus_nway regress educ_attain_husb educ_attain_wife if pseudo==0, vce(cluster namegrp_wife namegrp_husb) 
clus_nway regress educ_attain_husb educ_attain_wife if pseudo==1, vce(cluster namegrp_wife namegrp_husb) 


******* Combined regressions to recover ratios of coefficients and standard errors of ratios

* Interactions of the pseudo-sample indicator with each spouse's education.
gen inter_wife = pseudo*educ_attain_wife
gen inter_husb = pseudo*educ_attain_husb

** husb on wife
* Coefficient order in e(b): educ_attain_wife pseudo inter_wife _cons.
clus_nway regress educ_attain_husb educ_attain_wife pseudo inter_wife, vce(cluster namegrp_wife namegrp_husb)

matrix b = e(b)
matrix vcv=e(V)

* Ratio b1/(b1 + b3) with a delta-method standard error:
*   multa = d(rat)/d(b1) =  b3/(b1 + b3)^2
*   multb = d(rat)/d(b3) = -b1/(b1 + b3)^2
* followed by the display of the ratio, its se, and ratio -/+ 1.96*se.
local rat2 = b[1,1]/(b[1,1] + b[1,3])
di `rat2'
local multa = (b[1,3])/(b[1,1] + b[1,3])^2
local multb = (-b[1,1])/(b[1,1] + b[1,3])^2


local var2 = ((`multa')^2)*vcv[1,1] + ((`multb')^2)*vcv[3,3] + 2*`multa'*`multb'*vcv[3,1]
local se2 = sqrt(`var2')
di `se2'

di `rat2' - 1.96*`se2'
di `rat2' + 1.96*`se2'

***wife on husb
* Coefficient order in e(b): educ_attain_husb pseudo inter_husb _cons.
* The locals rat2, multa, multb, var2 and se2 are reused (overwritten) here.

clus_nway regress educ_attain_wife educ_attain_husb pseudo inter_husb, vce(cluster namegrp_wife namegrp_husb)

matrix b = e(b)
matrix vcv=e(V)

local rat2 = b[1,1]/(b[1,1] + b[1,3])
di `rat2'
local multa = (b[1,3])/(b[1,1] + b[1,3])^2
local multb = (-b[1,1])/(b[1,1] + b[1,3])^2


local var2 = ((`multa')^2)*vcv[1,1] + ((`multb')^2)*vcv[3,3] + 2*`multa'*`multb'*vcv[3,1]
local se2 = sqrt(`var2')
di `se2'

di `rat2' - 1.96*`se2'
di `rat2' + 1.96*`se2'


* Finish tableA-3.txt, then delete the two intermediate files created above.
log close

foreach f in simulation_dataset_2_clean temp_file_2 {
	erase "$homedir/`f'.dta"
}
